{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 模型构建 C盘\n",
    "import pandas as pd\n",
    "inputfile = 'attrsConstruction.xlsx'\n",
    "\n",
    "data = pd.read_excel(inputfile)\n",
    "df = data.iloc[:len(data)-5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(-0.56351181471562739, 0.87901557028092969, 3L, 38L, {'5%': -2.9412623574865142, '1%': -3.6155091011809297, '10%': -2.6091995013850418}, 859.99762204232331)\n",
      "0.879015570281\n",
      "原始序列经过1阶差分后归于平稳，p值为9.57297559233e-07\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\compat\\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n",
      "  from pandas.core import datetools\n",
      "D:\\Anaconda2\\lib\\site-packages\\ipykernel_launcher.py:17: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
     ]
    }
   ],
   "source": [
    "# 第 * 1 * 步--C盘---------平稳性检测\n",
    "#1)平稳性检测 ：判断是否平稳，若不平稳，对其进行差分处理直至平稳\n",
    "# 方法：采用单位根检验（ADF）的方法或者时序图的方法（见数据探索模块）\n",
    "# 注意：其他平稳性检验方法见steadyCheck.py文件\n",
    "from statsmodels.tsa.stattools import adfuller as ADF\n",
    "diff = 0\n",
    "# 判断D盘数据的平稳性，以及确定几次差分后平稳\n",
    "adf = ADF(df['CWXT_DB:184:C:\\\\'])\n",
    "print adf \n",
    "\n",
    "while adf[1] >= 0.05 : # adf[1]是p值，p值小于0.05认为是平稳的\n",
    "    print adf[1]\n",
    "    diff = diff + 1\n",
    "    adf = ADF(df['CWXT_DB:184:C:\\\\'].diff(diff).dropna())#注意，差分后使用ADF检验时，必须去掉空值\n",
    "    \n",
    "print (u'原始序列经过%s阶差分后归于平稳，p值为%s') % (diff, adf[1])\n",
    "df['CWXT_DB:184:C:\\\\_adf'] = df['CWXT_DB:184:C:\\\\'].diff(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "原始序列为非白噪声序列，对应的p值为：1.06099075081e-08\n",
      "一阶差分序列为白噪声序列，对应的p值为：0.474552255255\n"
     ]
    }
   ],
   "source": [
    "# 第 * 2 * 步--C盘---------白噪声检验\n",
    "# 目的：验证序列中有用信息是否已经被提取完毕，需要进行白噪声检验。若序列是白噪声序列，说明序列中有用信息已经被提取完，只剩随机扰动\n",
    "# 方法：采用LB统计量的方法进行白噪声检验\n",
    "# 若没有通过白噪声检验，则需要进行模型识别，识别其模型属于AR、MA还是ARMA。\n",
    "\n",
    "inputfile2 = 'attrsConstruction.xlsx'\n",
    "data1 = pd.read_excel(inputfile2)\n",
    "data1 = data1.iloc[:len(data1)-5]# 不使用最后五个数据（作为预测参考）\n",
    "\n",
    "# 白噪声检测\n",
    "from statsmodels.stats.diagnostic import acorr_ljungbox\n",
    "\n",
    "[[lb], [p]] = acorr_ljungbox(data1['CWXT_DB:184:C:\\\\'], lags = 1) ## lags是残差延迟个数\n",
    "if p < 0.05:\n",
    "    print (u'原始序列为非白噪声序列，对应的p值为：%s' % p)\n",
    "else:\n",
    "    print (u'原始序列为白噪声序列，对应的p值为：%s' % p)\n",
    "\n",
    "[[lb], [p]] = acorr_ljungbox(data1['CWXT_DB:184:C:\\\\'].diff(1).dropna(), lags = 1)\n",
    "if p < 0.05:\n",
    "    print (u'一阶差分序列为非白噪声序列，对应的p值为：%s' % p)\n",
    "else:\n",
    "    print (u'一阶差分序列为白噪声序列，对应的p值为：%s' % p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:628: RuntimeWarning: overflow encountered in exp\n",
      "  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:628: RuntimeWarning: invalid value encountered in divide\n",
      "  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:629: RuntimeWarning: overflow encountered in exp\n",
      "  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:629: RuntimeWarning: invalid value encountered in divide\n",
      "  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:584: RuntimeWarning: overflow encountered in exp\n",
      "  newparams = ((1-np.exp(-params))/\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:585: RuntimeWarning: overflow encountered in exp\n",
      "  (1+np.exp(-params))).copy()\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:585: RuntimeWarning: invalid value encountered in divide\n",
      "  (1+np.exp(-params))).copy()\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:586: RuntimeWarning: overflow encountered in exp\n",
      "  tmp = ((1-np.exp(-params))/\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:587: RuntimeWarning: overflow encountered in exp\n",
      "  (1+np.exp(-params))).copy()\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:587: RuntimeWarning: invalid value encountered in divide\n",
      "  (1+np.exp(-params))).copy()\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\base\\model.py:473: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available\n",
      "  'available', HessianInversionWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\base\\model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  \"Check mle_retvals\", ConvergenceWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\base\\model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  \"Check mle_retvals\", ConvergenceWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\tsa\\tsatools.py:612: RuntimeWarning: divide by zero encountered in divide\n",
      "  invarcoefs = -np.log((1-params)/(1+params))\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\base\\model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  \"Check mle_retvals\", ConvergenceWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\base\\model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  \"Check mle_retvals\", ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      0     1     2     3     4\n",
      "0  None  None  None  None  None\n",
      "1  None  None  None  None  None\n",
      "2  None  None  None  None  None\n",
      "3  None  None  None  None  None\n",
      "4  None  None  None  None  None\n",
      "Series([], dtype: object)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda2\\lib\\site-packages\\statsmodels\\base\\model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  \"Check mle_retvals\", ConvergenceWarning)\n"
     ]
    }
   ],
   "source": [
    "# 第 * 3 * 步----------模型识别\n",
    "# 方法：采用极大似然比方法进行模型的参数估计，估计各个参数的值。\n",
    "# 然后针对各个不同模型，采用BIC信息准则对模型进行定阶，确定p,q参数，从而选择最优模型。\n",
    "# 注意，进行此步时，index需要为时间序列类型\n",
    "# 确定最佳p、d、q的值\n",
    "inputfile3 = 'attrsConstruction.xlsx'\n",
    "data2 = pd.read_excel(inputfile3,index_col='COLLECTTIME')\n",
    "xtest_value=data2['CWXT_DB:184:C:\\\\'][-5:]\n",
    "data2 = data2.iloc[:len(data2)-5]# 不使用最后五个数据（作为预测参考） \n",
    "xdata2 = data2['CWXT_DB:184:C:\\\\']\n",
    "# ARIMA（p,d,q）中,AR是自回归,p为自回归项数；MA为滑动平均,q为滑动平均项数,d为使之成为平稳序列所做的差分次数(阶数)，由前一步骤知d=1\n",
    "from statsmodels.tsa.arima_model import ARIMA#建立ARIMA（p,1，q）模型\n",
    "\n",
    "# 定阶\n",
    "# 目前选择模型常用如下准则!!!!!\n",
    "#* AIC=-2 ln(L) + 2 k 中文名字：赤池信息量 akaike information criterion (AIC)\n",
    "# * BIC=-2 ln(L) + ln(n)*k 中文名字：贝叶斯信息量 bayesian information criterion (BIC)\n",
    "# * HQ=-2 ln(L) + ln(ln(n))*k hannan-quinn criterion (HQ)\n",
    "# 增加自由参数的数目提高了拟合的优良性，\n",
    "# AIC/BIC/HQ鼓励数据拟合的优良性但是尽量避免出现过度拟合(Overfitting)的情况。所以优先考虑的模型应是AIC/BIC/HQ值最小的那一个\n",
    "\n",
    "# 方法三：HQ方式----------------------------------------------------------\n",
    "pmax = int(len(xdata2)/10) # 一般阶数不超过length/10\n",
    "qmax = int(len(xdata2)/10) # 一般阶数不超过length/10\n",
    "\n",
    "hq_matrix = [] # hq矩阵\n",
    "for p in range(pmax+1):\n",
    "    tmp = []\n",
    "    for q in range(qmax+1):\n",
    "        try:\n",
    "            print ARIMA(xdata2, (p,1,q)).fit().hq\n",
    "            tmp.append(ARIMA(xdata2, (p,1,q)).fit().hq) #存在部分为空值，会报错,所以加上异常控制\n",
    "        except:\n",
    "            tmp.append(None)\n",
    "            \n",
    "    hq_matrix.append(tmp)\n",
    "    \n",
    "hq_matrix = pd.DataFrame(hq_matrix) # 从中可以找出最小值\n",
    "print hq_matrix\n",
    "print hq_matrix.stack()\n",
    "\n",
    "# 说明：由于用HQ训练模型时，都是空值，所以，本例使用HQ不合适"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
